import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.graph_objects as go
import plotly.express as px
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def load_data(path='Shopping-Mall-Customer-Data .csv'):
    """Read the customer dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the file name used by
            this notebook (note the space before ``.csv``), so existing
            calls without arguments keep working.

    Returns:
        pandas.DataFrame with the contents of the file.
    """
    return pd.read_csv(path)


data = load_data()
data
| | Customer ID | Age | Gender | Annual Income | Spending Score |
|---|---|---|---|---|---|
| 0 | d410ea53-6661-42a9-ad3a-f554b05fd2a7 | 30 | Male | 151479 | 89 |
| 1 | 1770b26f-493f-46b6-837f-4237fb5a314e | 58 | Female | 185088 | 95 |
| 2 | e81aa8eb-1767-4b77-87ce-1620dc732c5e | 62 | Female | 70912 | 76 |
| 3 | 9795712a-ad19-47bf-8886-4f997d6046e3 | 23 | Male | 55460 | 57 |
| 4 | 64139426-2226-4cd6-bf09-91bce4b4db5e | 24 | Male | 153752 | 76 |
| ... | ... | ... | ... | ... | ... |
| 15074 | a0504768-a85f-4930-ac24-55bc8e4fec9e | 29 | Female | 97723 | 30 |
| 15075 | a08c4e0e-d1fe-48e7-9366-aab11ae409cd | 22 | Male | 73361 | 74 |
| 15076 | 0e87c25a-268c-401a-8ba1-7111dcde6f1a | 18 | Female | 112337 | 48 |
| 15077 | 5f388cbe-3373-4e16-b743-38f508f2249f | 26 | Female | 94312 | 5 |
| 15078 | b8b8f561-ebca-4401-8afe-544c906554ba | 19 | Male | 78045 | 2 |
15079 rows × 5 columns
def data_cleaning(data):
    """Strip incomplete and repeated rows from ``data`` in place.

    Rows containing any missing value are removed first, then exact
    duplicate rows. The frame passed in is modified directly and the very
    same object is handed back; the index is not reset.

    Args:
        data: pandas.DataFrame to clean.

    Returns:
        The same DataFrame object, after cleaning.
    """
    # Order matters: missing-value rows go first, duplicates second.
    for scrub in (data.dropna, data.drop_duplicates):
        scrub(inplace=True)
    return data
# Clean the loaded frame: data_cleaning drops rows with missing values and
# duplicate rows in place and returns the same DataFrame.
data = data_cleaning(data)
def perform_eda(data):
    """Show three exploratory plots for the customer data.

    1. Correlation heatmap of the numeric columns.
    2. Bar chart of the ``Gender`` value counts.
    3. Bar chart of customer counts per age bracket.

    Args:
        data: pandas.DataFrame with at least the ``Gender`` and ``Age``
            columns.

    Returns:
        None. The figures are displayed with ``plt.show()``.
    """
    # --- Correlation heatmap ---
    plt.figure(figsize=(10, 6))
    # Restrict to numeric columns: string columns such as 'Customer ID' and
    # 'Gender' make DataFrame.corr() raise on pandas >= 2.0.
    correlation_matrix = data.select_dtypes(include='number').corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
    plt.title("Correlation Heatmap")
    plt.show()

    # --- Gender distribution ---
    genders = data.Gender.value_counts()
    # Set the style and background color
    sns.set_style("white")
    plt.figure(figsize=(10, 6))
    sns.set_palette("pastel")
    sns.barplot(x=genders.index, y=genders.values)
    # Removing grid lines
    sns.despine(left=True)
    plt.xlabel("Gender")
    plt.ylabel("Count")
    plt.title("Gender Distribution")
    plt.show()

    # --- Age analysis ---
    plt.figure(figsize=(10, 6))
    sns.set_style("white")
    ages = data.Age
    # (label, lowest age, highest age), both bounds inclusive.
    age_brackets = [
        ("18-25", 18, 25),
        ("26-35", 26, 35),
        ("36-45", 36, 45),
        ("46-55", 46, 55),
        ("56-65", 56, 65),
    ]
    x = [label for label, _, _ in age_brackets]
    y = [int(ages.between(low, high).sum()) for _, low, high in age_brackets]
    # The open-ended bucket must start above 65; the previous `>= 56`
    # condition counted everyone aged 56-65 a second time.
    x.append("65+")
    y.append(int((ages > 65).sum()))
    sns.barplot(x=x, y=y, palette="Blues")
    plt.title("Number of Customer and Ages")
    plt.xlabel("Age")
    plt.ylabel("Number of Customer")
    plt.show()


perform_eda(data)
def encode_categorical(data, categorical_cols):
    """One-hot encode the requested categorical columns.

    Only the columns from ``categorical_cols`` that are actually present in
    ``data`` are encoded; missing names are ignored. The first level of each
    column is dropped (``drop='first'``).

    Args:
        data: pandas.DataFrame to encode.
        categorical_cols: Iterable of column names to one-hot encode.

    Returns:
        A new DataFrame in which the encoded columns replace the originals,
        or ``data`` unchanged when none of the requested columns exist.
    """
    existing_cols = [col for col in categorical_cols if col in data.columns]
    if not existing_cols:
        print("No categorical columns found for encoding.")
        return data

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed
    # the old keyword; fall back for older installations.
    try:
        encoder = OneHotEncoder(sparse_output=False, drop='first')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, drop='first')

    # Use the columns that really exist, not the raw request, so a partially
    # missing list does not raise KeyError.
    encoded_data = encoder.fit_transform(data[existing_cols])
    # Reuse the source index: after dropna/drop_duplicates the index may have
    # gaps, and joining on a fresh RangeIndex would misalign rows and
    # introduce NaNs.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(existing_cols),
        index=data.index,
    )
    data = data.drop(existing_cols, axis=1).join(encoded_df)
    print("Categorical columns encoded.")
    return data


data = encode_categorical(data, categorical_cols=['Gender'])
Categorical columns encoded.
def know_your_features(data):
    """Print summary information about ``data`` and plot its correlations.

    Prints the frame's ``info()`` and ``describe()`` output, the ``Gender``
    value counts when that column is present, and shows a correlation
    heatmap of the numeric columns.

    Args:
        data: pandas.DataFrame to summarize.

    Returns:
        None.
    """
    # DataFrame.info() writes to stdout itself and returns None; wrapping it
    # in print() would emit a stray "None" line.
    data.info()
    print(data.describe())
    # 'Gender' no longer exists once encode_categorical has replaced it with
    # one-hot columns, so only report it when available.
    if 'Gender' in data.columns:
        print(data['Gender'].value_counts())
    plt.figure(figsize=(12, 8))
    # Correlation is only defined for numeric columns.
    sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()
def feature_selection(data, feature_cols):
    """Pick the given columns and standardize them.

    Args:
        data: pandas.DataFrame holding the feature columns.
        feature_cols: List of column names to keep.

    Returns:
        Array produced by ``StandardScaler.fit_transform`` on the selected
        columns.
    """
    selected = data[feature_cols]
    standardized = StandardScaler().fit_transform(selected)
    print("Features selected and scaled.")
    return standardized


data_scaled = feature_selection(data, feature_cols=['Age', 'Annual Income', 'Spending Score'])
Features selected and scaled.
def pre_clustering_visualization(data_scaled):
    """Plot 2-D PCA and t-SNE projections of the data side by side.

    Args:
        data_scaled: Feature matrix to project (as returned by
            ``feature_selection``).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Compute both embeddings up front: PCA first, then t-SNE (fixed seed).
    projections = (
        ("PCA - Pre-clustering", PCA(n_components=2).fit_transform(data_scaled)),
        ("t-SNE - Pre-clustering", TSNE(n_components=2, random_state=42).fit_transform(data_scaled)),
    )

    _, panels = plt.subplots(1, 2, figsize=(14, 6))
    for panel, (heading, points) in zip(panels, projections):
        sns.scatterplot(x=points[:, 0], y=points[:, 1], ax=panel)
        panel.set_title(heading)
    plt.show()


pre_clustering_visualization(data_scaled)
C:\Users\mkami\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:
found 0 physical cores < 1
Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
warnings.warn(
File "C:\Users\mkami\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
raise ValueError(f"found {cpu_count_physical} physical cores < 1")
def hierarchical_clustering(data_scaled, k=6):
    """Run Ward hierarchical clustering, show the dendrogram and cut the tree.

    Args:
        data_scaled: Feature matrix to cluster.
        k: Maximum number of flat clusters to form when cutting the tree
            (``criterion='maxclust'``). Defaults to 6, the value previously
            hard-coded after inspecting the dendrogram.

    Returns:
        Tuple ``(clusters, k)`` where ``clusters`` is the array of flat
        cluster labels returned by ``fcluster`` and ``k`` is the cluster
        count that was requested.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    linkage_matrix = linkage(data_scaled, method='ward')

    plt.figure(figsize=(10, 7))
    dendrogram(linkage_matrix)
    plt.title("Dendrogram for Hierarchical Clustering")
    plt.xlabel("Sample Index")
    plt.ylabel("Distance")
    plt.show()

    # Cut the tree so that at most k flat clusters remain.
    clusters = fcluster(linkage_matrix, k, criterion='maxclust')
    print(f"Clusters formed with k={k}")
    return clusters, k


clusters, k = hierarchical_clustering(data_scaled)
Clusters formed with k=6
def post_clustering_visualization(data_scaled, clusters):
    """Plot 2-D PCA and t-SNE projections colored by cluster label.

    Args:
        data_scaled: Feature matrix to project.
        clusters: Cluster label for each row of ``data_scaled``; used as the
            scatter-plot hue.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Compute both embeddings up front: PCA first, then t-SNE (fixed seed).
    projections = (
        ("PCA - Post-clustering", PCA(n_components=2).fit_transform(data_scaled)),
        ("t-SNE - Post-clustering", TSNE(n_components=2, random_state=42).fit_transform(data_scaled)),
    )

    _, panels = plt.subplots(1, 2, figsize=(14, 6))
    for panel, (heading, points) in zip(panels, projections):
        sns.scatterplot(x=points[:, 0], y=points[:, 1], hue=clusters, palette="viridis", ax=panel)
        panel.set_title(heading)
    plt.show()


post_clustering_visualization(data_scaled, clusters)
C:\Users\mkami\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py:136: UserWarning: Could not find the number of physical cores for the following reason:
found 0 physical cores < 1
Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
warnings.warn(
File "C:\Users\mkami\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
raise ValueError(f"found {cpu_count_physical} physical cores < 1")
#pip install plotly
def radar_plot(data, clusters):
    """Draw a radar (polar) chart of standardized per-cluster feature means.

    The numeric columns of ``data`` are averaged per cluster, the resulting
    table of means is standardized column-wise, and one polar trace is drawn
    per cluster.

    Args:
        data: pandas.DataFrame with one row per sample. It is not modified.
        clusters: Cluster label for each row of ``data``.

    Returns:
        None. The plotly figure is displayed with ``fig.show()``.
    """
    # Attach the labels to a copy so the caller's DataFrame does not silently
    # gain a 'Cluster' column.
    labelled = data.assign(Cluster=clusters)
    # Average numeric columns only; text columns such as 'Customer ID' cannot
    # be averaged and make groupby().mean() raise on pandas >= 2.0.
    cluster_means = labelled.groupby('Cluster').mean(numeric_only=True)

    cluster_means_scaled = StandardScaler().fit_transform(cluster_means)
    # Keep the cluster labels as the index so each trace is named after the
    # real label instead of its row position.
    cluster_means_scaled_df = pd.DataFrame(
        cluster_means_scaled,
        columns=cluster_means.columns,
        index=cluster_means.index,
    )
    categories = list(cluster_means.columns)

    fig = go.Figure()
    for label, row in cluster_means_scaled_df.iterrows():
        fig.add_trace(go.Scatterpolar(
            r=row.values,
            theta=categories,
            fill='toself',
            name=f"Cluster {label}"
        ))
    fig.update_layout(
        polar=dict(
            radialaxis=dict(visible=True, range=[-2, 2])
        ),
        title="Cluster Radar Plot",
        showlegend=True
    )
    fig.show()


radar_plot(data, clusters)
# Elbow method: fit K-Means for 1..9 clusters and record the inertia
# (within-cluster sum of squares) of each fit.
distortions = []
K = range(1, 10)
# Use a dedicated loop variable: iterating with `k` would overwrite the
# global `k` returned by hierarchical_clustering and leave it at 9.
for n_clusters in K:
    # Fixed seed keeps the curve reproducible and matches the seed used in
    # compare_kmeans_hierarchical.
    kmeanModel = KMeans(n_clusters=n_clusters, random_state=42)
    kmeanModel.fit(data_scaled)
    distortions.append(kmeanModel.inertia_)
# Plot elbow curve
plt.figure(figsize=(8, 6))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('Elbow Method to find optimal k')
plt.show()
def compare_kmeans_hierarchical(data_scaled, k_hier=6, k_kmeans=6, feature_names=None):
    """Compare Ward hierarchical clustering with K-Means on the same data.

    Prints the silhouette score of both clusterings, shows PCA and t-SNE
    scatter plots colored by the K-Means labels, draws a radar chart of the
    K-Means cluster centers and prints a short summary.

    Args:
        data_scaled: Feature matrix to cluster.
        k_hier: Maximum number of flat clusters for the hierarchical cut.
        k_kmeans: Number of K-Means clusters.
        feature_names: Optional axis labels for the radar chart, one per
            column of ``data_scaled``. When omitted, the notebook's default
            names are used and trimmed/extended to the number of features.

    Returns:
        None.

    Raises:
        ValueError: If ``feature_names`` is given but its length differs
            from the number of features.
    """
    # Hierarchical Clustering
    linkage_matrix = linkage(data_scaled, method='ward')
    hier_clusters = fcluster(linkage_matrix, k_hier, criterion='maxclust')
    hier_silhouette = silhouette_score(data_scaled, hier_clusters)
    print(f"Hierarchical Clustering Silhouette Score: {hier_silhouette:.2f}")

    # K-Means Clustering
    kmeans = KMeans(n_clusters=k_kmeans, random_state=42)
    kmeans_labels = kmeans.fit_predict(data_scaled)
    kmeans_silhouette = silhouette_score(data_scaled, kmeans_labels)
    print(f"K-Means Clustering Silhouette Score: {kmeans_silhouette:.2f}")

    # PCA Visualization for K-Means Clustering
    pca = PCA(n_components=2)
    data_pca = pca.fit_transform(data_scaled)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.scatter(data_pca[:, 0], data_pca[:, 1], c=kmeans_labels, cmap='viridis', s=50)
    plt.title("K-Means Clustering with PCA")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")

    # t-SNE Visualization for K-Means Clustering
    tsne = TSNE(n_components=2, random_state=42)
    data_tsne = tsne.fit_transform(data_scaled)
    plt.subplot(1, 2, 2)
    plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=kmeans_labels, cmap='viridis', s=50)
    plt.title("K-Means Clustering with t-SNE")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.show()

    # Radar Chart for K-Means Clustering
    cluster_centers = kmeans.cluster_centers_
    n_features = cluster_centers.shape[1]
    if feature_names is None:
        # The label list must have exactly one entry per feature; a fixed
        # five-entry list did not match the three-feature input and left
        # `r` and `theta` with different lengths.
        default_labels = ['Age', 'Annual Income', 'Spending Score', 'Component 4', 'Component 5']
        radar_labels = default_labels[:n_features] + [
            f'Component {i}' for i in range(len(default_labels) + 1, n_features + 1)
        ]
    else:
        radar_labels = list(feature_names)
        if len(radar_labels) != n_features:
            raise ValueError(
                f"feature_names has {len(radar_labels)} entries but the data has {n_features} features"
            )

    fig = go.Figure()
    for i in range(k_kmeans):
        fig.add_trace(go.Scatterpolar(
            r=cluster_centers[i],
            theta=radar_labels,
            fill='toself',
            name=f'Cluster {i+1}'
        ))
    # Centers of standardized data are frequently negative, so the radial
    # axis has to start at the smallest center value rather than at 0.
    radial_range = [float(np.min(cluster_centers)), float(np.max(cluster_centers))]
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=radial_range)),
        title="K-Means Cluster Centers Radar Chart"
    )
    fig.show()

    print("Comparison Summary:")
    print(f"Silhouette Score - Hierarchical Clustering: {hier_silhouette:.2f}")
    print(f"Silhouette Score - K-Means Clustering: {kmeans_silhouette:.2f}")
    print("Differences: Hierarchical clustering may capture different patterns due to merging approach, while K-means directly assigns points to clusters.")


compare_kmeans_hierarchical(data_scaled)
Hierarchical Clustering Silhouette Score: 0.21 K-Means Clustering Silhouette Score: 0.29
Comparison Summary: Silhouette Score - Hierarchical Clustering: 0.21 Silhouette Score - K-Means Clustering: 0.29 Differences: Hierarchical clustering may capture different patterns due to merging approach, while K-means directly assigns points to clusters.